{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 1. Description<font>\n",
    "\n",
    "Sentiment classification using Rotten Tomatoes (Movie) review dataset (binary classification).\n",
    "Dataset can be downloaded from https://drive.google.com/file/d/1w1TsJB-gmIkZ28d1j7sf1sqcPmHXw352/view\n",
    "Please dowonload the data manually with your browser and store it to `datasets` directory.\n",
    "    \n",
    "The Rotten Tomatoes movie review dataset is a corpus of movie reviews used for sentiment analysis. We will classify a review to be positive ('fresh') or negative ('rotten') on the basis of review text.\n",
    "Using this dataset, we train a classifier to predict movie rating based on the review text."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 2. Data Preprocessing<font>\n",
    "\n",
    "For RT review classification we will perform some data preparation and data cleaning steps. We will generate feature vectors using sklearn TF-IDF for review text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import time\n",
    "import pandas as pd\n",
    "from collections import OrderedDict\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embed(x_train, x_test):\n",
    "    '''\n",
    "    We will generate feature vectors using sklearn TF-IDF for review text.\n",
    "    '''\n",
    "    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "    count_vect = CountVectorizer()\n",
    "    x_train_counts = count_vect.fit_transform(x_train)\n",
    "    x_test_counts = count_vect.transform(x_test)\n",
    "\n",
    "    tfidf_transformer = TfidfTransformer()\n",
    "    x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)\n",
    "    x_test_tfidf = tfidf_transformer.transform(x_test_counts)\n",
    "    return x_train_tfidf, x_test_tfidf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_data(fname):\n",
    "    '''\n",
    "    For RT review classification we will perform some data preparation and data cleaning steps.\n",
    "    '''\n",
    "    df = pd.read_csv(fname, encoding=\"ISO-8859-1\")\n",
    "    df = df.dropna().drop_duplicates().sample(frac=1)\n",
    "    print(\"Dataset contains {} reviews\".format(df.shape[0]))\n",
    "    mapping = {'fresh': 1, 'rotten': 0}\n",
    "    df['Freshness'] = df.replace({'Freshness': mapping})\n",
    "    \n",
    "    from sklearn.model_selection import train_test_split\n",
    "    x_train, x_test, y_train, y_test = train_test_split(df[\"Review\"],\n",
    "                                                        df[\"Freshness\"],\n",
    "                                                        random_state = 42)\n",
    "    \n",
    "    x_train, x_test = create_embed(x_train, x_test)\n",
    "    y_train = y_train.to_numpy(dtype='int64')\n",
    "    y_test = y_test.to_numpy(dtype='int64')\n",
    "    return x_train, x_test, y_train, y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset contains 339716 reviews\n",
      "shape of train data: (254787, 77722)\n",
      "shape of test data: (84929, 77722)\n"
     ]
    }
   ],
   "source": [
    "#---- Data Preparation ----\n",
    "\n",
    "DATA_FILE = \"datasets/rt_reviews.csv\"\n",
    "x_train, x_test, y_train, y_test = preprocess_data(DATA_FILE)\n",
    "print(\"shape of train data: {}\".format(x_train.shape))\n",
    "print(\"shape of test data: {}\".format(x_test.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 3. Algorithm Evaluation<font>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_time = []\n",
    "test_time = []\n",
    "accuracy = []\n",
    "precision = []\n",
    "recall = []\n",
    "f1 = []\n",
    "estimator_name = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(estimator, estimator_nm, \n",
    "             x_train, y_train,\n",
    "             x_test, y_test):\n",
    "    '''\n",
    "    To generate performance report for both frovedis and sklearn estimators\n",
    "    '''\n",
    "    estimator_name.append(estimator_nm)\n",
    "    start_time = time.time()\n",
    "    estimator.fit(x_train, list(y_train))\n",
    "    train_time.append(round(time.time() - start_time, 4))\n",
    "\n",
    "    start_time = time.time()\n",
    "    pred_y = estimator.predict(x_test)\n",
    "    test_time.append(round(time.time() - start_time, 4))\n",
    "\n",
    "    accuracy.append(metrics.accuracy_score(list(y_test), list(pred_y)))\n",
    "    precision.append(metrics.precision_score(list(y_test), list(pred_y), average='macro'))\n",
    "    recall.append(metrics.recall_score(list(y_test), list(pred_y), average='macro'))\n",
    "    f1.append(metrics.f1_score(list(y_test), list(pred_y), average='macro'))\n",
    "\n",
    "    return metrics.classification_report(list(y_test), list(pred_y))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.1 Binary LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Frovedis LogisticRegression metrices: \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.78      0.74      0.76     37695\n",
      "           1       0.80      0.83      0.81     47234\n",
      "\n",
      "    accuracy                           0.79     84929\n",
      "   macro avg       0.79      0.79      0.79     84929\n",
      "weighted avg       0.79      0.79      0.79     84929\n",
      "\n",
      "Sklearn LogisticRegression metrices: \n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.72      0.72      0.72     37695\n",
      "           1       0.78      0.78      0.78     47234\n",
      "\n",
      "    accuracy                           0.75     84929\n",
      "   macro avg       0.75      0.75      0.75     84929\n",
      "weighted avg       0.75      0.75      0.75     84929\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import frovedis\n",
    "TARGET = \"binary_logistic_regression_sag\"\n",
    "from frovedis.exrpc.server import FrovedisServer\n",
    "FrovedisServer.initialize(\"mpirun -np 8 \" + os.environ[\"FROVEDIS_SERVER\"])\n",
    "from frovedis.mllib.linear_model import LogisticRegression as frovLogisticRegression\n",
    "f_est = frovLogisticRegression(solver='sag', multi_class='auto',\n",
    "                               max_iter=3500, penalty='none', \\\n",
    "                               lr_rate=0.005)\n",
    "E_NM = TARGET + \"_frovedis_\" + frovedis.__version__\n",
    "f_report = evaluate(f_est, E_NM, \\\n",
    "                    x_train, y_train, x_test, y_test)\n",
    "f_est.release()\n",
    "FrovedisServer.shut_down()\n",
    "\n",
    "import sklearn\n",
    "from sklearn.linear_model import LogisticRegression as skLogisticRegression\n",
    "s_est = skLogisticRegression(solver ='sag', multi_class = 'auto', \n",
    "                             max_iter=3500, penalty='none', \\\n",
    "                             n_jobs = 12)\n",
    "E_NM = TARGET + \"_sklearn_\" + sklearn.__version__\n",
    "s_report = evaluate(s_est, E_NM, \\\n",
    "                    x_train, y_train, x_test, y_test)\n",
    "\n",
    "print(\"Frovedis LogisticRegression metrices: \")\n",
    "print(f_report)\n",
    "print(\"Sklearn LogisticRegression metrices: \")\n",
    "print(s_report)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### <font color='green'> 4. Performance summary<font>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>estimator</th>\n",
       "      <th>train time</th>\n",
       "      <th>test time</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>precision</th>\n",
       "      <th>recall</th>\n",
       "      <th>f1-score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>binary_logistic_regression_sag_frovedis_0.9.10</td>\n",
       "      <td>3.0587</td>\n",
       "      <td>0.0727</td>\n",
       "      <td>0.790119</td>\n",
       "      <td>0.788250</td>\n",
       "      <td>0.785167</td>\n",
       "      <td>0.786381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>binary_logistic_regression_sag_sklearn_0.24.1</td>\n",
       "      <td>407.8879</td>\n",
       "      <td>0.0076</td>\n",
       "      <td>0.753100</td>\n",
       "      <td>0.749969</td>\n",
       "      <td>0.749389</td>\n",
       "      <td>0.749661</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        estimator  train time  test time  \\\n",
       "0  binary_logistic_regression_sag_frovedis_0.9.10      3.0587     0.0727   \n",
       "1   binary_logistic_regression_sag_sklearn_0.24.1    407.8879     0.0076   \n",
       "\n",
       "   accuracy  precision    recall  f1-score  \n",
       "0  0.790119   0.788250  0.785167  0.786381  \n",
       "1  0.753100   0.749969  0.749389  0.749661  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ---- evaluation summary ----\n",
    "summary = pd.DataFrame(OrderedDict({ \"estimator\": estimator_name,\n",
    "                                     \"train time\": train_time,\n",
    "                                     \"test time\": test_time,\n",
    "                                     \"accuracy\": accuracy,\n",
    "                                     \"precision\": precision,\n",
    "                                     \"recall\": recall,\n",
    "                                     \"f1-score\": f1\n",
    "                                  }))\n",
    "summary"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
